import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from scipy.spatial import distance_matrix
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, ShuffleSplit, LeaveOneOut
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pydot import graph_from_dot_data
from sklearn import metrics
from sklearn.decomposition import PCA
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
np.random.seed(66)
pd.set_option('display.max_columns', 999)

# Load the essay feature matrix and split rows by author: Hamilton/Madison
# essays are used for model building (cross-validation set), while the
# disputed and jointly-attributed ('HM') essays are held out for the final
# attribution step.
Essay_Tree = pd.read_csv('/Users/yashpasar/Downloads/Disputed_Essay_data.csv')
print('Columns with null values:', sum(Essay_Tree.isnull().any()))
cv_Essay_Tree = Essay_Tree.loc[Essay_Tree['author'].isin(['Hamilton', 'Madison']), :]
test_Essay_Tree = Essay_Tree.loc[Essay_Tree['author'].isin(['dispt', 'HM']), :]
cv_Essay_Tree.shape, test_Essay_Tree.shape  # notebook display expression

# Min/max-scale the 70 numeric feature columns (iloc 2:72) and encode the
# author as 0 (Hamilton) / 1 (anything else, i.e. Madison here).
min_max_scaler = preprocessing.MinMaxScaler()
features = min_max_scaler.fit_transform(cv_Essay_Tree.iloc[:, 2:72].values)
label = cv_Essay_Tree.iloc[:, 0].values
labels = [int(author_name != 'Hamilton') for author_name in label]
X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.3, random_state=1)  # 70% training / 30% validation
# Two-cluster k-means on the scaled training features (fixed seed, 25 restarts).
kmeans = KMeans(n_clusters=2, n_init=25, max_iter=100, random_state=6)
kmeans.fit(X_train)
# NOTE(review): classification_report below treats raw cluster ids as class
# labels (0 = Hamilton, 1 = Madison).  K-means assigns cluster ids
# arbitrarily, so with a different seed the clusters could come out swapped
# and the reported precision/recall would invert — confirm the id->author
# mapping before trusting these numbers.
train_pred = kmeans.predict(X_train)
print(classification_report(y_train, train_pred, target_names = ['Hamilton', 'Madison']))
# Same caveat applies to the validation-split report.
valid_pred = kmeans.predict(X_valid)
print(classification_report(y_valid, valid_pred, target_names = ['Hamilton', 'Madison']))
from scipy.spatial.distance import cdist

# Elbow-method sweep: fit k-means for k = 1..9 and record both
#   * distortion — mean distance from each point to its nearest centroid
#   * inertia    — within-cluster sum of squared distances
# Fixes vs. the original: the model was fitted twice per k (once in the
# constructor chain and once via an extra .fit call), and the loop body had
# lost its indentation in the notebook export.
distortions = []
inertias = []
K = range(1, 10)
for k in K:
    # No random_state here, matching the original — results may vary per run.
    kmeanModel = KMeans(n_clusters=k).fit(X_train)
    distortions.append(
        sum(np.min(cdist(X_train, kmeanModel.cluster_centers_, 'euclidean'), axis=1))
        / X_train.shape[0]
    )
    inertias.append(kmeanModel.inertia_)
# Render the two elbow curves (distortion first, then inertia) with the same
# styling; the trailing bare expression mirrors the original notebook cells,
# where it echoed the list (it has no effect in a plain script).
for curve, x_label, y_label, chart_title in (
    (distortions, 'k', 'Distortion', 'The Elbow Method to Identify Optimal k'),
    (inertias, 'Values of K', 'Inertia', 'The Elbow Method using Inertia'),
):
    plt.plot(K, curve, 'bx-')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.show()
    curve  # notebook display expression
# Project the scaled training features onto their first two principal
# components so the k-means cluster assignments can be drawn in 2-D.
pca = PCA(n_components=2)
train_PCA = pca.fit_transform(X_train)
train_PCA_1, train_PCA_2 = train_PCA[:, 0], train_PCA[:, 1]
train_pred_df = pd.DataFrame({
    'pc1': train_PCA_1,
    'pc2': train_PCA_2,
    'Prediction': train_pred,
})
train_pred_df.head()  # notebook display expression
%matplotlib inline
trace0= go.Scatter(x=train_pred_df[train_pred_df.Prediction == 0]['pc1'],
y=train_pred_df[train_pred_df.Prediction == 0]['pc2'],
name="Prediction for Cluster 0",
mode ="markers",
marker =dict(size=10,color="rgba(15,152,152,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
trace1= go.Scatter(x=train_pred_df[train_pred_df.Prediction == 1]['pc1'],
y=train_pred_df[train_pred_df.Prediction == 1]['pc2'],
name="Prediction for Cluster 1",
mode ="markers",
marker =dict(size=10,color="rgba(180,18,180,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
fig = go.Figure()
fig.add_trace(trace0)
fig.add_trace(trace1)
fig.show(renderer="notebook")
#Principal component separation to create 2 dim picture
# Project the validation split into the SAME principal-component space that
# was fitted on the training data.  The original called fit_transform here,
# which re-fit PCA on the validation set alone and produced axes that are
# not comparable with the training-set plot above.
valid_PCA = pca.transform(X_valid)
valid_PCA_1 = valid_PCA[:, 0]
valid_PCA_2 = valid_PCA[:, 1]
valid_pred_df = pd.DataFrame({'pc1':valid_PCA_1, 'pc2':valid_PCA_2, 'Prediction': valid_pred })
valid_pred_df.head()  # notebook display expression
%matplotlib inline
trace0= go.Scatter(x=valid_pred_df[valid_pred_df.Prediction == 0]['pc1'],
y=valid_pred_df[valid_pred_df.Prediction == 0]['pc2'],
name="Prediction for Cluster 0",
mode ="markers",
marker =dict(size=10,color="rgba(15,152,152,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
trace1= go.Scatter(x=valid_pred_df[valid_pred_df.Prediction == 1]['pc1'],
y=valid_pred_df[valid_pred_df.Prediction == 1]['pc2'],
name="Prediction for Cluster 1",
mode ="markers",
marker =dict(size=10,color="rgba(180,18,180,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
fig = go.Figure()
fig.add_trace(trace0)
fig.add_trace(trace1)
fig.show(renderer="notebook")
from scipy.cluster.hierarchy import ward, dendrogram, cut_tree
import scipy.spatial.distance as ssd
# Ward linkage computed directly from the observation matrix (scipy's ward()
# accepts raw observations; no distance-matrix conversion happens here,
# despite what an earlier comment suggested).  `ssd` is imported but unused
# in this section.
linkage_matrix = ward(features)
# Draw the dendrogram with the author names as leaf labels.
fig, ax = plt.subplots(figsize=(20, 10))
ax.grid(False)
ax.set_title('Cluster Dendrogram', fontsize = 25)
ax = dendrogram(linkage_matrix, orientation='top', labels=label)
plt.xticks(fontsize=15)
plt.show()
# Agglomerative clustering with defaults (2 clusters, ward linkage).
# NOTE(review): as with k-means above, cluster ids are arbitrary — the
# report assumes cluster 0 == Hamilton; verify before trusting the scores.
hac = AgglomerativeClustering()
pred = hac.fit_predict(features)
print(classification_report(labels, pred, target_names = ['Hamilton', 'Madison']))
cv_Essay_Tree.shape, test_Essay_Tree.shape  # notebook display expression
# Rebuild the feature matrix from every column at index 2 onward, rescale,
# and re-split with the same seed so the tree sees the same 70/30 partition.
features = cv_Essay_Tree.iloc[:, 2:].values
features = min_max_scaler.fit_transform(features)
label = cv_Essay_Tree.iloc[:, 0].values
labels = [int(author_name != 'Hamilton') for author_name in label]
X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.3, random_state=1)  # 70% training / 30% validation

# Baseline decision tree with default depth/leaf settings.
clf = DecisionTreeClassifier(random_state=25)
clf.fit(X_train, y_train)
pred = clf.predict(X_valid)
print(classification_report(y_valid, pred, target_names=['Hamilton', 'Madison']))
clf.tree_.max_depth  # notebook display expression
print(f"Accuracy: {round(metrics.accuracy_score(y_valid, pred)*100)}%")
# Render the fitted tree to 'Essay_Tree.png' and display it inline.
# Fixes vs. the original:
#   * StringIO comes from the stdlib `io` module — the file-level
#     `sklearn.externals.six` import was removed from scikit-learn 0.23+.
#   * feature_names must match the columns actually used to train clf
#     (iloc[:, 2:]); the original sliced columns[1:71], which was off by
#     one and labelled every split node with the wrong feature.
from io import StringIO as _StringIO

dot_data = _StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=Essay_Tree.columns[2:])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('Essay_Tree.png')
Image(graph.create_png())
# Exhaustive hyper-parameter search over tree-shape settings, scored by
# 10-fold cross-validated accuracy on the training split.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 10, 20],
    'max_depth': [5, 10, 20, 25, 30],
    'min_samples_leaf': [1, 5, 10],
    'max_leaf_nodes': [2, 5, 10, 20],
}
grid = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_score_)
# Echo the winning hyper-parameter combination, one per line.
for param_name, param_value in grid.best_params_.items():
    print(f"{param_name}: {param_value}")
# Re-fit the tree with the hyper-parameters the grid search just selected.
# The original hard-coded one specific search result (gini, depth 5,
# 2 leaf nodes, ...); reading grid.best_params_ keeps this cell in sync with
# the search if the data or the grid changes.  random_state is not part of
# the grid, so there is no keyword collision.
clf = DecisionTreeClassifier(random_state=25, **grid.best_params_)
clf.fit(X_train, y_train)
pred = clf.predict(X_valid)
print(classification_report(y_valid, pred, target_names=['Hamilton', 'Madison']))
print(f"Accuracy: {round(metrics.accuracy_score(y_valid, pred)*100)}%")
# Prepare the held-out essays (disputed + jointly-attributed).
x_test = test_Essay_Tree.iloc[:, 2:]
# Scale with the ALREADY-FITTED scaler.  The original called fit_transform
# here, which re-fit the min/max ranges on the test essays themselves and
# put them on a different scale than the data the models were trained on.
x_test = min_max_scaler.transform(x_test)
y_test = test_Essay_Tree.iloc[:, 0].values
# Fit PCA on the full cross-validation feature set; these two components
# define the 2-D space used for the final comparison plot below.
fPCA = pca.fit_transform(features)
PCA_1, PCA_2 = fPCA[:, 0], fPCA[:, 1]
PCA_df = pd.DataFrame({
    'pc1': PCA_1,
    'pc2': PCA_2,
    'label': label,
})
PCA_df.head()  # notebook display expression
#Principal component separation to create 2 dim picture
# Project the test essays into the SAME component space fitted on the
# cross-validation essays above, then stack both point sets into one frame
# for plotting.  The original re-ran fit_transform on x_test, so the two
# concatenated point sets lived in two different, incomparable PCA spaces.
test_PCA = pca.transform(x_test)
test_PCA_1 = list(test_PCA[:, 0])
test_PCA_2 = list(test_PCA[:, 1])
new_PCA_1 = list(PCA_1) + test_PCA_1
new_PCA_2 = list(PCA_2) + test_PCA_2
new_label = list(label) + list(y_test)
test_pred_df = pd.DataFrame({'pc1':new_PCA_1, 'pc2':new_PCA_2, 'Label': new_label})
test_pred_df.Label.unique()  # notebook display expression
%matplotlib inline
trace0= go.Scatter(x=test_pred_df[test_pred_df.Label == 'Hamilton']['pc1'],
y=test_pred_df[test_pred_df.Label == 'Hamilton']['pc2'],
name="Hamilton's Cluster",
mode ="markers",
marker =dict(size=10,color="rgba(15,152,152,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
trace1= go.Scatter(x=test_pred_df[test_pred_df.Label == 'Madison']['pc1'],
y=test_pred_df[test_pred_df.Label == 'Madison']['pc2'],
name="Madison's Cluster",
mode ="markers",
marker =dict(size=10,color="rgba(610,18,180,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
trace2= go.Scatter(x=test_pred_df[test_pred_df.Label == 'dispt']['pc1'],
y=test_pred_df[test_pred_df.Label == 'dispt']['pc2'],
name="Disputed's Cluster",
mode ="markers",
marker =dict(size=10,color="rgba(150,12,17,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
trace3= go.Scatter(x=test_pred_df[test_pred_df.Label == 'HM']['pc1'],
y=test_pred_df[test_pred_df.Label == 'HM']['pc2'],
name="HM's Cluster",
mode ="markers",
marker =dict(size=10,color="rgba(252,252,332,0.5)",line=dict(width=1,color="rgb(0,0,0)")))
fig = go.Figure()
fig.add_trace(trace0)
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.add_trace(trace3)
fig.show(renderer="notebook")
# Attribute the disputed/HM essays with the K-MEANS model.  The original
# called clf.predict (the decision tree) here, so this "K Means" table was
# a duplicate of the decision-tree results below instead of the k-means
# assignment it claims to show.
# NOTE(review): k-means cluster ids are arbitrary; cluster 0 == Hamilton is
# the same assumption made by the earlier classification reports — confirm
# the mapping (e.g. against the training-set majority) before publishing.
test_pred_Essay_Tree_KM = kmeans.predict(x_test)
pred_label_Essay_Tree_KM = ['Hamilton' if i==0 else 'Madison' for i in test_pred_Essay_Tree_KM]
pd.DataFrame({'Author': test_Essay_Tree.iloc[:,0].values, 'Predicted Author using K Means' : pred_label_Essay_Tree_KM})
# Final decision-tree verdict on the held-out essays
# (class 0 -> Hamilton, class 1 -> Madison, per the label encoding above).
test_pred_Essay_Tree_DT = clf.predict(x_test)
pred_label_Essay_Tree_DT = ['Madison' if cls else 'Hamilton' for cls in test_pred_Essay_Tree_DT]
pd.DataFrame({
    'Author': test_Essay_Tree.iloc[:, 0].values,
    'Predicted Author using Decision Tree': pred_label_Essay_Tree_DT,
})